import pandas as pd
import numpy as np
import scipy
import scipy.stats as st
from scipy import stats


samp = np.array([ 20.8, 18.7, 19.9, 20.6, 21.9, 23.4, 22.8, 24.9, 22.2, 20.3, 24.9, 22.3,
                27.0, 20.3, 22.2, 24.0, 21.1, 22.1, 22.0,22.7])


tset, pval = stats.ttest_1samp(samp, 20, alternative='two-sided')
print('p-values',pval)
if pval < 0.06:    # alpha value is 0.06 or 6%
   print("we are rejecting null hypothesis")
else:
  print("we are accepting null hypothesis")

p-values 7.739940216301966e-05
we are rejecting null hypothesis


df1 = pd.read_csv('BodyFatPercentage.csv')
df1.head()


df1["mean"] = df1.iloc[:,1:].mean(axis = 1)
df1.head()


def mean_confidence_interval(data, confidence=0.85):
    a = 1.0 * np.array(data)
    n = len(a)  
    m, se = np.mean(a), scipy.stats.sem(a)  
    h = se * scipy.stats.t.ppf((1 + confidence) / 2., n-1)   
    return m-h, m+h


ci = df1.iloc[:,1:-1].apply(mean_confidence_interval, axis = 1) 
low = [i[0] for i in ci]
high = [i[1] for i in ci] 
df1["low"] = low 
df1["high"] = high


df1.head()


# The confidence interval varies in each sample of the data


df2 = pd.read_csv('DartsVersusExperts.csv')
df2.head()


df2['Expert_Mean'] = df2.iloc[:,2:6].mean(axis = 1)
df2.head()


tset, pval = stats.ttest_ind(df2['Expert_Mean'], df2['DJIA'], alternative='less')
print('p-values',pval)
if pval < 0.06:    # alpha value is 0.06 or 6%
   print("Experts do outperform the Dow Jones Industrial Average")
else:
  print("Experts do not outperform the Dow Jones Industrial Average")

p-values 0.981305801435415
Experts do not outperform the Dow Jones Industrial Average


df3 = pd.read_csv('PlanoVersusPlusLenses.csv')
df3.head()


df3["mean"] = df3.iloc[:,4:6].mean(axis = 1)
df3.head()


def mean_confidence_interval(data, confidence=0.99):
    a = 1.0 * np.array(data)
    n = len(a)  
    m, se = np.mean(a), scipy.stats.sem(a)  
    h = se * scipy.stats.t.ppf((1 + confidence) / 2., n-1)   
    return m-h, m+h


ci = df3.iloc[:,4:6].apply(mean_confidence_interval, axis = 1) 
low = [i[0] for i in ci]
high = [i[1] for i in ci] 
df3["low"] = low 
df3["high"] = high


df1.head()


# Function for computing confidence intervals
from statsmodels.stats.proportion import proportion_confint   

proportion_confint(count=15,    # Number of "successes"
                   nobs=34,    # Number of trials
                   alpha=(1 - 0.95))
# Alpha, which is 1 minus the confidence level

C:\Users\shahz\AppData\Roaming\Python\Python39\site-packages\statsmodels\compat\pandas.py:61: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.
  from pandas import Int64Index as NumericIndex

(0.2742780812717178, 0.6080748599047527)


# The confidence interval varies from (0.2742780812717178 to 0.6080748599047527)


# Performs the test just described
from statsmodels.stats.proportion import proportions_ztest
res = proportions_ztest(count=15,
                        nobs=34,
                        value=0.75,  # The hypothesized value of population proportion p
                        alternative='two-sided') # Tests the "not equal to" alternative hypothesis

res

(-3.62665570173781, 0.00028711571884950185)


df1.head()


var1 = df1['AGE'].to_numpy
var2 = df1['BODYFAT'].to_numpy


import scipy.stats
from scipy.stats import pearsonr


stat, p = scipy.stats.pearsonr(df1['AGE'], df1['BODYFAT'])
print('stat=%.3f, p=%.3f' % (stat, p))
if p > 0.01:
    print('Probably independent')
else:
    print('Probably dependent')

stat=0.289, p=0.000
Probably dependent


n = len(df1)
r = round(stat,2)


import math


def r_to_z(r):
    return math.log((1 + r) / (1 - r)) / 2.0

def fisher_z_confidence_interval(r, alpha, n):
    z = r_to_z(r)
    se = 1.0 / math.sqrt(n - 3)
    z_crit = stats.norm.ppf((1 + alpha)/2)  # 2-tailed z critical value

    lo = z - z_crit * se
    hi = z + z_crit * se

    # Return a sequence
    return (lo, hi)


fisher_z_confidence_interval(r, 0.80, n)

(0.21735123312185609, 0.3797812941985007)


# The interval varies from (0.20878496946167766 to 0.3712150305383223)


df4 = pd.read_csv('AgrestiFinlayCrime.csv')
df4.head()


df4['crime'].var == 185000

False


df4['crime'].var()

194569.4949019608


p = 1 - stats.chi2.cdf(df4['crime'].var(), 1)


if p < 0.1:
    print('Rejecting Null Hypothesis')
else:
    print('Accepting Null Hypothesis')

Rejecting Null Hypothesis


df5 = pd.read_csv('Abalone.csv')
df5.head()


#define sample data
data = df5['LENGTH']

#create 90% confidence interval for population variance length
st.t.interval(alpha=0.90, df=len(data)-1, loc=np.var(data), scale=st.variation(data))

(576.3771744111087, 577.1312141794878)


# The interval varies between (576.3771744111087, 577.1312141794878)


df6 = pd.read_csv('AAPLSP50020022006.csv')
df6.head()


df7 = pd.read_csv('AAPLSP50020072011.csv')
df7.head()


return1 = df6[['AAPL','SP500']].pct_change(1)
return1 = return1.iloc[:,0:].mean(axis = 1)
return1 = return1.fillna(0)
return1.head()

0    0.000000
1   -0.071715
2    0.063740
3   -0.017975
4   -0.024408
dtype: float64


return2 = df7[['AAPL','SP500']].pct_change(1)
return2 = return2.iloc[:,0:].mean(axis = 1)
return2 = return1.fillna(0)
return2.head()

0    0.000000
1   -0.071715
2    0.063740
3   -0.017975
4   -0.024408
dtype: float64


tset, pval = stats.ttest_ind(return1, return2, alternative='two-sided')
print('p-values',pval)
if pval < 0.05:    # alpha value is 0.05 or 5%
   print("Null Hypothesis Rejected")
else:
  print("Null Hypothesis Accepted")

p-values 1.0
Null Hypothesis Accepted


df8 = pd.read_csv('EffectOfGenderBodyTemperaturesAndRestingHeartRate.csv')
df8.head()


male_df = df8[df8['Gender'] == 1]
female_df = df8[df8['Gender'] == 0]


T1 = male_df['BodyTemp'].to_numpy() 
T2 = female_df['BodyTemp'].to_numpy()


T1.var()

0.4807479289940827


T2.var()

0.5442698224852062


tset, pval = stats.ttest_ind(T1, T2, equal_var=False)
print('p-values',pval)
if pval < 0.05:    # alpha value is 0.05 or 5%
   print(" we are rejecting null hypothesis")
else:
  print("we are accepting null hypothesis")

p-values 0.023938264182940983
 we are rejecting null hypothesis


data = df8['Heart Rate']


mean_difference = abs(male_df['Heart Rate'].mean() - female_df['Heart Rate'].mean())
mean_difference

0.7846153846153925


st.norm.interval(alpha=0.90, loc=mean_difference, scale=st.sem(data))

(-0.23418244946297762, 1.8034132186937621)


# The confidence interval varies between (-0.23418244946297762, 1.8034132186937621)


mean1, sdt1 = 525.751, 107.121 # mean and standard deviation
mean2, sdt2 = 373.269, 67.498
s1 = np.random.normal(mean1, sdt1, 10)
s2 = np.random.normal(mean2, sdt2, 10)


s1.var()

7608.858771707371


s2.var()

2555.3009221915227


tset, pval = stats.ttest_ind(s1, s2, equal_var=False)
print('p-values',pval)
if pval < 0.01:    # alpha value is 0.01 or 1%
   print(" we are rejecting null hypothesis")
else:
  print("we are accepting null hypothesis")

p-values 0.00015927286944113057
 we are rejecting null hypothesis


true_mean = abs(mean1 - mean2)
true_mean

152.48199999999997


data = np.concatenate((s1, s2), axis=0)


st.norm.interval(alpha=0.90, loc=true_mean, scale=st.sem(data))

(110.62336670614155, 194.3406332938584)


# The confidence interval varies between (99.25218806532742, 205.7118119346725)


df9 = pd.read_csv('DailyReturnsForFiveStocks.csv')
df9.head()


XOM = df9['XOM']
HD = df9['HD']


XOM.var()

0.00039533504669103637


HD.var()

0.00045939887039631115


tset, pval = stats.ttest_ind(T1, T2, equal_var=False)
print('p-values',pval)
if pval < 0.1:    # alpha value is 0.1 or 10%
   print(" we are rejecting null hypothesis")
else:
  print("we are accepting null hypothesis")

p-values 0.023938264182940983
 we are rejecting null hypothesis


# Our two samples are XOM and HD
sample1 = XOM.tolist()
sample2 = HD.tolist()


# The degrees of freedom in each sample is its length minus 1
sample1_df = len(sample1) - 1
sample2_df = len(sample2) - 1

# Compute the ratio of the variances
import statistics
ratio = statistics.variance(sample1) / statistics.variance(sample2)

# Find the critical values from the F-distribution
from scipy import stats
alpha = 0.05       # replace with your chosen alpha (here, a 95% confidence level)
lower_critical_value = 1 / stats.f.ppf(q = 1 - alpha/2, dfn = sample1_df, dfd = sample2_df)
upper_critical_value = stats.f.ppf(q = 1 - alpha/2, dfn = sample2_df, dfd = sample1_df)

# Compute the confidence interval
lower_bound = ratio * lower_critical_value
upper_bound = ratio * upper_critical_value
lower_bound, upper_bound

(0.7628403250668637, 0.9707717832865929)


# The confidence interval varies between (0.7628403250668637, 0.9707717832865929)


df10 = pd.read_csv('EffectOfSeeingAYawnOnYawning.csv')
df10.head()


df10['Yawn'].replace("Didn't Yawn",0,inplace=True)
df10['Yawn'].replace('Yawned',1,inplace=True)


with_seed = df10[df10['Seeded'] == 'Yawn Seeded'] 
without_seed = df10[df10['Seeded'] == 'Not Seeded']


tset, pval = stats.ttest_ind(with_seed['Yawn'], without_seed['Yawn'], equal_var=True)
print('p-values',pval)
if pval < 0.1:    # alpha value is 0.1 or 10%
   print(" we are rejecting null hypothesis")
else:
  print("we are accepting null hypothesis")

p-values 0.7519489486672732
we are accepting null hypothesis


df11 = pd.read_csv('EffectOfSeeingAYawnOnYawningOnTortoises.csv')
df11.head()


df11['Yawning behaviour'].replace("None",0,inplace=True)
df11['Yawning behaviour'].replace('Yawned',1,inplace=True)


with_stimulus = df11[df11['Presence of stimulus'] == 'Yes'] 
without_stimulus = df11[df11['Presence of stimulus'] == 'No']


mean1 = with_stimulus['Yawning behaviour'].mean()
mean2 = without_stimulus['Yawning behaviour'].mean()


true_mean = abs(mean1 - mean2)
true_mean

0.1388888888888889


data = df11['Yawning behaviour']


st.norm.interval(alpha=0.80, loc=true_mean, scale=st.sem(data))

(0.07050863182565573, 0.20726914595212206)


# The confidence interval varies between (0.07050863182565573, 0.20726914595212206)


df12 = pd.read_csv('MotherEducationBirthWeight.csv')
df12.head()


var1 = df12['YEARSEDUC']
var2 = df12['BIRTHWEIGHT']


stat, p = scipy.stats.pearsonr(var1,var2)
print('stat=%.3f, p=%.3f' % (stat, p))
if p > 0.05:
    print('Probably independent')
else:
    print('Probably dependent')

stat=0.090, p=0.156
Probably independent


to_shuffle = df12['YEARSEDUC']
to_shuffle = to_shuffle.sample(frac = 1)
to_shuffle = to_shuffle.reset_index(drop = True)
to_shuffle.head()

0    12
1    17
2    12
3    12
4    12
Name: YEARSEDUC, dtype: int64


df12['YEARSEDUC_PERMUTED'] = to_shuffle


df12.head()


var3 = df12['YEARSEDUC_PERMUTED']


stat, p = scipy.stats.pearsonr(var3,var2)
print('stat=%.3f, p=%.3f' % (stat, p))
if p > 0.05:
    print('Probably independent')
else:
    print('Probably dependent')

stat=0.042, p=0.504
Probably independent

	IDNO	BODYFAT	DENSITY	AGE	WEIGHT	HEIGHT	NECK	CHEST	ABDOMEN	HIP	THIGH	KNEE	ANKLE	BICEPS	FOREARM	WRIST
0	1	12.6	1.0708	23	154.25	67.75	36.2	93.1	85.2	94.5	59.0	37.3	21.9	32.0	27.4	17.1
1	2	6.9	1.0853	22	173.25	72.25	38.5	93.6	83.0	98.7	58.7	37.3	23.4	30.5	28.9	18.2
2	3	24.6	1.0414	22	154.00	66.25	34.0	95.8	87.9	99.2	59.6	38.9	24.0	28.8	25.2	16.6
3	4	10.9	1.0751	26	184.75	72.25	37.4	101.8	86.4	101.2	60.1	37.3	22.8	32.4	29.4	18.2
4	5	27.8	1.0340	24	184.25	71.25	34.4	97.3	100.0	101.9	63.2	42.2	24.0	32.2	27.7	17.7

	IDNO	BODYFAT	DENSITY	AGE	WEIGHT	HEIGHT	NECK	CHEST	ABDOMEN	HIP	THIGH	KNEE	ANKLE	BICEPS	FOREARM	WRIST	mean
0	1	12.6	1.0708	23	154.25	67.75	36.2	93.1	85.2	94.5	59.0	37.3	21.9	32.0	27.4	17.1	50.824720
1	2	6.9	1.0853	22	173.25	72.25	38.5	93.6	83.0	98.7	58.7	37.3	23.4	30.5	28.9	18.2	52.419020
2	3	24.6	1.0414	22	154.00	66.25	34.0	95.8	87.9	99.2	59.6	38.9	24.0	28.8	25.2	16.6	51.859427
3	4	10.9	1.0751	26	184.75	72.25	37.4	101.8	86.4	101.2	60.1	37.3	22.8	32.4	29.4	18.2	54.798340
4	5	27.8	1.0340	24	184.25	71.25	34.4	97.3	100.0	101.9	63.2	42.2	24.0	32.2	27.7	17.7	56.595600

	IDNO	BODYFAT	DENSITY	AGE	WEIGHT	HEIGHT	NECK	CHEST	ABDOMEN	HIP	THIGH	KNEE	ANKLE	BICEPS	FOREARM	WRIST	mean	low	high
0	1	12.6	1.0708	23	154.25	67.75	36.2	93.1	85.2	94.5	59.0	37.3	21.9	32.0	27.4	17.1	50.824720	34.572313	67.077127
1	2	6.9	1.0853	22	173.25	72.25	38.5	93.6	83.0	98.7	58.7	37.3	23.4	30.5	28.9	18.2	52.419020	34.549177	70.288863
2	3	24.6	1.0414	22	154.00	66.25	34.0	95.8	87.9	99.2	59.6	38.9	24.0	28.8	25.2	16.6	51.859427	35.530650	68.188204
3	4	10.9	1.0751	26	184.75	72.25	37.4	101.8	86.4	101.2	60.1	37.3	22.8	32.4	29.4	18.2	54.798340	35.896462	73.700218
4	5	27.8	1.0340	24	184.25	71.25	34.4	97.3	100.0	101.9	63.2	42.2	24.0	32.2	27.7	17.7	56.595600	37.843712	75.347488

	Month	Year	Expert #1	Expert #2	Expert #3	Expert #4	Dart #1	Dart #2	Dart #3	Dart #4	DJIA	Average	Diff
0	Jul	1990	51.6	8.0	7.7	-16.7	53.8	-10.2	-8.6	-35.0	2.5	12.650	10.150
1	Aug	1990	56.7	37.8	27.8	-16.7	36.7	-3.7	-3.9	-22.0	11.5	26.400	14.900
2	Sep	1990	29.8	4.6	-9.4	-14.9	6.8	-9.8	-11.3	-42.9	-2.3	2.525	4.825
3	Oct	1990	-13.7	-18.2	-19.4	-28.6	44.4	-9.0	-20.3	-44.0	-9.2	-19.975	-10.775
4	Nov	1990	25.8	-39.8	-40.4	-96.9	12.9	-9.8	-31.4	-37.1	-8.5	-37.825	-29.325

	Month	Year	Expert #1	Expert #2	Expert #3	Expert #4	Dart #1	Dart #2	Dart #3	Dart #4	DJIA	Average	Diff	Expert_Mean
0	Jul	1990	51.6	8.0	7.7	-16.7	53.8	-10.2	-8.6	-35.0	2.5	12.650	10.150	12.650
1	Aug	1990	56.7	37.8	27.8	-16.7	36.7	-3.7	-3.9	-22.0	11.5	26.400	14.900	26.400
2	Sep	1990	29.8	4.6	-9.4	-14.9	6.8	-9.8	-11.3	-42.9	-2.3	2.525	4.825	2.525
3	Oct	1990	-13.7	-18.2	-19.4	-28.6	44.4	-9.0	-20.3	-44.0	-9.2	-19.975	-10.775	-19.975
4	Nov	1990	25.8	-39.8	-40.4	-96.9	12.9	-9.8	-31.4	-37.1	-8.5	-37.825	-29.325	-37.825

Statistical Data Analysis

Hypothesis Testing and Confidence Interval

Question 1:¶

Hypothesis:¶

Question 2:¶

Question 3:¶

Hypothesis:¶

Question 4:¶

Why is a paired approach appropriate¶

Question 5:¶

Question 6:¶

Question 7:¶

Question 8:¶

Question 9:¶

Hypothesis:¶

Question 10:¶

Question 11:¶

Hypothesis¶

Question 12¶

Question 13:¶

Question 14:¶

Question 15:¶

Question 16:¶

Question 17:¶

Question 18:¶

Question 19:¶

Question 20:¶

Question 20(a).¶

Question 20(b):¶

Question 20(c).¶

	ID	Refraction	Plano	Plus	Comp_Plano	Comp_Plus	d
0	1	Myopia	208	248	65	70	5.0
1	2	Myopia	314	386	65	90	25.0
2	3	Myopia	177	169	75	70	-5.0
3	4	Myopia	236	244	85	75	-10.0
4	5	Myopia	178	220	70	75	5.0

	ID	Refraction	Plano	Plus	Comp_Plano	Comp_Plus	d	mean
0	1	Myopia	208	248	65	70	5.0	67.5
1	2	Myopia	314	386	65	90	25.0	77.5
2	3	Myopia	177	169	75	70	-5.0	72.5
3	4	Myopia	236	244	85	75	-10.0	80.0
4	5	Myopia	178	220	70	75	5.0	72.5

	sid	state	crime	murder	pctmetro	pctwhite	pcths	poverty	single
0	1	ak	761	9.0	41.799999	75.199997	86.599998	9.100000	14.3
1	2	al	780	11.6	67.400002	73.500000	66.900002	17.400000	11.5
2	3	ar	593	10.2	44.700001	82.900002	66.300003	20.000000	10.7
3	4	az	715	8.6	84.699997	88.599998	78.699997	15.400000	12.1
4	5	ca	1078	13.1	96.699997	79.300003	76.199997	18.200001	12.5

	SEX	LENGTH	DIAMETER	HEIGHT	WHOLEWEIGHT	SHUCKEDWEIGHT	VISCERAWEIGHT	SHELLWEIGHT	RINGS	AGE
0	M	91	73	19	102.8	44.9	20.2	30.0	15	16.5
1	M	70	53	18	45.1	19.9	9.7	14.0	7	8.5
2	F	106	84	27	135.4	51.3	28.3	42.0	9	10.5
3	M	88	73	25	103.2	43.1	22.8	31.0	10	11.5
4	I	66	51	16	41.0	17.9	7.9	11.0	7	8.5

	Time	AAPL	SP500	rf
0	1/31/02	12.31	1130.20	1.72
1	2/28/02	10.80	1106.73	1.72
2	3/28/02	11.78	1147.39	1.74
3	4/30/02	12.08	1076.92	1.73
4	5/31/02	11.60	1067.14	1.71

	Time	AAPL	SP500	rf
0	1/31/2007	85.36	1438.24	4.97
1	2/28/2007	84.25	1406.82	4.99
2	3/30/2007	92.51	1420.86	4.89
3	4/30/2007	99.37	1482.37	4.72
4	5/31/2007	120.67	1530.62	4.59

	Date	IBM	MSFT	HD	XOM	AAPL
0	6/3/2008	0.003732	-0.017780	0.005915	-0.023975	-0.003886
1	6/4/2008	-0.002282	0.008447	0.002940	0.000517	-0.000975
2	6/5/2008	0.007284	0.027523	0.027219	0.041478	0.022885
3	6/6/2008	-0.027497	-0.028339	-0.026498	-0.028164	-0.019987
4	6/9/2008	0.007350	0.007990	-0.023451	0.026171	-0.021747

	Seeded	Yawn
0	Yawn Seeded	Yawned
1	Yawn Seeded	Yawned
2	Yawn Seeded	Yawned
3	Yawn Seeded	Yawned
4	Yawn Seeded	Yawned

	BodyTemp	Gender	Heart Rate
0	96.3	1	70
1	96.7	1	71
2	96.9	1	74
3	97.0	1	80
4	97.1	1	73

	Presence of stimulus	Yawning behaviour
0	Yes	Yawned
1	Yes	Yawned
2	Yes	Yawned
3	Yes	Yawned
4	Yes	Yawned

	YEARSEDUC	BIRTHWEIGHT
0	11	1733
1	12	1932
2	12	4233
3	11	2977
4	13	2617

	YEARSEDUC	BIRTHWEIGHT	YEARSEDUC_PERMUTED
0	11	1733	12
1	12	1932	17
2	12	4233	12
3	11	2977	12
4	13	2617	12